In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import matplotlib
%matplotlib inline

See a few samples of data


In [2]:
!head ../data/vgsales.csv

#!more ../data/vgsales.csv


Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,
1,Wii Sports,Wii,2006,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74,
2,Super Mario Bros.,NES,1985,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,
3,Mario Kart Wii,Wii,2008,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82,
4,Wii Sports Resort,Wii,2009,Sports,Nintendo,15.75,11.01,3.28,2.96,33,
5,Pokemon Red/Pokemon Blue,GB,1996,Role-Playing,Nintendo,11.27,8.89,10.22,1,31.37,
6,Tetris,GB,1989,Puzzle,Nintendo,23.2,2.26,4.22,0.58,30.26,
7,New Super Mario Bros.,DS,2006,Platform,Nintendo,11.38,9.23,6.5,2.9,30.01,
8,Wii Play,Wii,2006,Misc,Nintendo,14.03,9.2,2.93,2.85,29.02,
9,New Super Mario Bros. Wii,Wii,2009,Platform,Nintendo,14.59,7.06,4.7,2.26,28.62,

Load data


In [3]:
# read data from csv file
data = pd.read_csv("../data/vgsales.csv", 
                   warn_bad_lines = True, 
                   error_bad_lines = False, 
                   verbose = True)
print data.info()
print "\n\n data shape: ", data.shape


Tokenization took: 201.46 ms
Type conversion took: 11.39 ms
Parser memory cleanup took: 0.01 ms
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 12 columns):
Rank            16598 non-null int64
Name            16598 non-null object
Platform        16598 non-null object
Year            16328 non-null float64
Genre           16598 non-null object
Publisher       16540 non-null object
NA_Sales        16598 non-null float64
EU_Sales        16598 non-null float64
JP_Sales        16598 non-null float64
Other_Sales     16598 non-null float64
Global_Sales    16598 non-null float64
Unnamed: 11     0 non-null float64
dtypes: float64(7), int64(1), object(4)
memory usage: 1.5+ MB
None


 data shape:  (16598, 12)

In [4]:
# drop first & last coloumn
data.drop(data.columns[[0, 11]], axis = 1, inplace = True)
data["Year"] = data["Year"].astype('category')
print data.info()
print data.shape
# print data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 10 columns):
Name            16598 non-null object
Platform        16598 non-null object
Year            16328 non-null category
Genre           16598 non-null object
Publisher       16540 non-null object
NA_Sales        16598 non-null float64
EU_Sales        16598 non-null float64
JP_Sales        16598 non-null float64
Other_Sales     16598 non-null float64
Global_Sales    16598 non-null float64
dtypes: category(1), float64(5), object(4)
memory usage: 1.2+ MB
None
(16598, 10)

missing value statistics


In [5]:
def count_missing(x):
  return sum(x.isnull()) / float(len(x))

# missing value info
print "Missing Value Statistics"
print data.apply(count_missing, axis = 0)


Missing Value Statistics
Name            0.000000
Platform        0.000000
Year            0.016267
Genre           0.000000
Publisher       0.003494
NA_Sales        0.000000
EU_Sales        0.000000
JP_Sales        0.000000
Other_Sales     0.000000
Global_Sales    0.000000
dtype: float64

Coherence Validation


In [6]:
data[(data['Global_Sales'] < (data['NA_Sales'] + data['EU_Sales'] + data['JP_Sales'] + data['Other_Sales']) - 0.02)]


Out[6]:
Name Platform Year Genre Publisher NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales
105 Tekken 3 PS 1998.0 Fighting Sony Computer Entertainment 3.27 2.22 1.4 0.29 7.16

Central Tendency


In [7]:
# basic descriptive statistics, here 50% quartile is just the median
data.describe(percentiles = [0.25,0.5,0.75,0.997], 
              include = 'all')


Out[7]:
Name Platform Year Genre Publisher NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales
count 16598 16598 16328.0 16598 16540 16598.000000 16598.000000 16598.000000 16598.000000 16598.000000
unique 11494 31 39.0 12 578 NaN NaN NaN NaN NaN
top Need for Speed: Most Wanted DS 2009.0 Action Electronic Arts NaN NaN NaN NaN NaN
freq 12 2163 1431.0 3316 1351 NaN NaN NaN NaN NaN
mean NaN NaN NaN NaN NaN 0.264668 0.146653 0.077780 0.048063 0.537441
std NaN NaN NaN NaN NaN 0.816683 0.505351 0.309291 0.188588 1.555028
min NaN NaN NaN NaN NaN 0.000000 0.000000 0.000000 0.000000 0.010000
25% NaN NaN NaN NaN NaN 0.000000 0.000000 0.000000 0.000000 0.060000
50% NaN NaN NaN NaN NaN 0.080000 0.020000 0.000000 0.010000 0.170000
75% NaN NaN NaN NaN NaN 0.240000 0.110000 0.040000 0.040000 0.470000
99.7% NaN NaN NaN NaN NaN 5.619710 3.540450 2.682090 1.230000 11.211350
max NaN NaN NaN NaN NaN 41.490000 29.020000 10.220000 10.570000 82.740000

Spread


In [8]:
data.skew(axis = 0)


Out[8]:
NA_Sales        18.799639
EU_Sales        18.875551
JP_Sales        11.206441
Other_Sales     24.233923
Global_Sales    17.400645
dtype: float64

In [9]:
data.kurt(axis = 0)


Out[9]:
NA_Sales         649.130824
EU_Sales         756.028720
JP_Sales         194.233495
Other_Sales     1025.348144
Global_Sales     603.932346
dtype: float64

Box Plot


In [10]:
data.plot.box()


Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f4f9ed8cfd0>

In [11]:
data2 = data.loc[(data['NA_Sales'] <= 1) & (data['EU_Sales'] <= 1) &
                 (data['JP_Sales'] <= 1) & (data['Other_Sales'] <= 1) &
                 (data['Global_Sales'] <= 1)]
data2.describe(percentiles = [0.25,0.5,0.75,0.997], include = 'all')


Out[11]:
Name Platform Year Genre Publisher NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales
count 14544 14544 14296.0 14544 14489 14544.000000 14544.000000 14544.000000 14544.000000 14544.000000
unique 10485 31 39.0 12 570 NaN NaN NaN NaN NaN
top LEGO The Hobbit DS 2009.0 Action Electronic Arts NaN NaN NaN NaN NaN
freq 8 2021 1315.0 2894 1013 NaN NaN NaN NaN NaN
mean NaN NaN NaN NaN NaN 0.111285 0.053524 0.039102 0.018190 0.222421
std NaN NaN NaN NaN NaN 0.145362 0.089619 0.098900 0.033426 0.227924
min NaN NaN NaN NaN NaN 0.000000 0.000000 0.000000 0.000000 0.010000
25% NaN NaN NaN NaN NaN 0.000000 0.000000 0.000000 0.000000 0.050000
50% NaN NaN NaN NaN NaN 0.060000 0.020000 0.000000 0.010000 0.140000
75% NaN NaN NaN NaN NaN 0.160000 0.060000 0.030000 0.020000 0.320000
99.7% NaN NaN NaN NaN NaN 0.770000 0.530000 0.710000 0.220000 0.990000
max NaN NaN NaN NaN NaN 0.940000 0.920000 1.000000 0.740000 1.000000

In [12]:
data2.boxplot(figsize = (8,8))


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f4f9eba7350>

Histogram


In [13]:
data['NA_Sales'].hist(bins = 50)


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f4f9ea03050>

Categorical Analysis


In [14]:
data['Genre'].unique()


Out[14]:
array(['Sports', 'Platform', 'Racing', 'Role-Playing', 'Puzzle', 'Misc',
       'Shooter', 'Simulation', 'Action', 'Fighting', 'Adventure',
       'Strategy'], dtype=object)

In [15]:
genre_cnt = data['Genre'].value_counts(sort = False)
print genre_cnt
genre_freq = genre_cnt / float(data.shape[0])
print genre_freq


Role-Playing    1488
Simulation       867
Sports          2346
Platform         886
Puzzle           582
Shooter         1309
Fighting         848
Action          3316
Misc            1739
Strategy         682
Adventure       1286
Racing          1249
Name: Genre, dtype: int64
Role-Playing    0.089649
Simulation      0.052235
Sports          0.141342
Platform        0.053380
Puzzle          0.035064
Shooter         0.078865
Fighting        0.051090
Action          0.199783
Misc            0.104772
Strategy        0.041089
Adventure       0.077479
Racing          0.075250
Name: Genre, dtype: float64

In [16]:
fig, ax = plt.subplots(figsize=(8,5))
genre_cnt.plot(kind = 'bar', ax = ax, rot = 90)
plt.title('Genre Distribution', fontsize = 15)
plt.xlabel('Genre', fontsize = 15)
plt.ylabel('Sales Number', fontsize = 15)


Out[16]:
<matplotlib.text.Text at 0x7f4f9e8921d0>

In [17]:
fig, ax = plt.subplots(figsize=(18,15))
data.groupby(['Year', 'Platform']).sum().unstack().plot(y = 'Global_Sales', kind = 'bar', ax = ax, stacked = True, colormap = 'Paired')
plt.show()


Top-N Count Analysis


In [18]:
# print data['Genre'].dropna().tolist()
genre = Counter(data['Genre'].dropna().tolist())
total = sum(genre.values())
genre = genre.most_common() # parameter can be the N
N = len(genre)

print "%d categories with total %d records" % (N, total)

genre_name = [item[0] for item in genre]
genre_counts = [item[1] / float(total) for item in genre]
# print genre_counts

fig, ax = plt.subplots(figsize=(18,15))
sns.barplot(x = genre_name, y = genre_counts, ax = ax)
plt.title("Top-%d Genre" % (N), fontsize = 15)
plt.xlabel("Genre", fontsize = 15)
plt.ylabel("Top-%d Genre" % (N), fontsize = 15)
ticks = plt.setp(ax.get_xticklabels(), fontsize = 15, rotation = 60)


12 categories with total 16598 records

Top-N Sales


In [19]:
platforms = data['Platform'].unique()
platform_sales = []

for platform in platforms:
  platform_sales.append(data[data['Platform'] == platform]['Global_Sales'].dropna().sum())

fig, ax = plt.subplots(figsize = (18, 15))
sns.barplot(x = platforms, y = platform_sales, ax = ax, palette = sns.color_palette("PuBu", 10))

plt.title("Platform Sales", fontsize = 15)
plt.xlabel("Platform Category", fontsize = 15)
plt.ylabel("Total Sales", fontsize = 15)
ticks = plt.setp(ax.get_xticklabels(), fontsize = 15, rotation = 60)


Heat Map


In [20]:
table_sales = pd.pivot_table(data, values = ['Global_Sales'], index = ['Platform'], columns = ['Genre'], aggfunc = np.mean, fill_value = 0, margins = False)

fig, ax = plt.subplots(figsize = (18, 15))
sns.heatmap(table_sales, linewidth = .5, annot = True, vmin = 0.01, fmt = '.2f', cmap = 'PuBu')
plt.title("Platform-Genre Sales", fontsize = 15)
# ticks_y = plt.setp(ax.get_yticklabels(), fontsize = 15)
ticks_x = plt.setp(ax.get_xticklabels(), rotation = 60)
print ax.get_xticklabels()


<a list of 12 Text xticklabel objects>

In [21]:
table_sales = pd.pivot_table(data, values = ['Global_Sales'], index = ['Platform'], columns = ['Genre'], aggfunc = 'count', fill_value = 0, margins = False)
# print table_sales

fig, ax = plt.subplots(figsize = (18, 15))
sns.heatmap(table_sales, linewidth = .5, annot = True, vmin = 0, fmt = '2.0f', cmap = 'PuBu')
plt.title("Platform-Genre Sales", fontsize = 15)
# ticks_y = plt.setp(ax.get_yticklabels(), fontsize = 15)
# ticks = plt.setp(ax.get_xticklabels(), fontsize = 15, rotation = 60)


Out[21]:
<matplotlib.text.Text at 0x7f4f9d24e610>

Word Cloud


In [22]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image

stopwords = set(STOPWORDS)

for genre in data.Genre.unique():
    # print data.Name[data.Genre == genre].to_string()
    if genre == "Sports":
      wc = WordCloud(background_color = "white", max_font_size = 40, max_words = 200, stopwords = stopwords, random_state = 42)
      wc.generate(data.Name[data.Genre == genre].to_string())
      plt.imshow(wc)
      plt.title(genre)
      plt.axis("off")
      plt.show()


Pie Chart


In [23]:
fig, ax = plt.subplots(figsize = (8, 8))
publisher = data.groupby('Publisher').sum()['Global_Sales']
publisher.sort_values(ascending = False)[:10].plot.pie()
ax.set_ylabel("")
plt.tight_layout()

# print publisher